from pyspark.context import SparkContext

# Entry point to Spark for this script; master="local" runs the job in-process
# on the local machine rather than on a cluster.
# NOTE(review): the app name says "word_count" although the script computes
# per-group averages; left as-is because it is a runtime-visible string.
sc = SparkContext(master="local", appName="word_count")

# One RDD element per line of the input text file.
students_rdd = sc.textFile('../../data/students.txt')


def _to_class_age(line):
    """Convert one comma-separated line into a ``(key, value)`` pair.

    The key is the last field of the line and the value is the third field
    parsed as an ``int`` (presumably the class name and the student's age,
    judging by how ``avg_fun`` names them -- confirm against the data file).

    Raises:
        IndexError: if the line has fewer than three comma-separated fields.
        ValueError: if the third field is not a valid integer literal.
    """
    # Split once and reuse the fields instead of re-splitting per access.
    fields = line.split(',')
    return fields[-1], int(fields[2])


kv_rdd = students_rdd.map(_to_class_age)

# Collect every value that shares a key into a single iterable per key.
group_by_key_rdd = kv_rdd.groupByKey()

def avg_fun(kv):
    """Return ``(key, mean)`` for one grouped ``(key, values)`` pair.

    Args:
        kv: An indexable pair whose first item is the group key and whose
            second item is a sized collection of numbers (it must support
            both ``sum()`` and ``len()``).

    Returns:
        A tuple of the unchanged key and the arithmetic mean of the values,
        rounded to two decimal places.

    Raises:
        ZeroDivisionError: if the collection of values is empty.
    """
    group_key = kv[0]
    values = kv[1]

    total = sum(values)
    count = len(values)
    mean = round(total / count, 2)

    return group_key, mean

try:
    # Reduce each (key, values) group to a (key, rounded average) pair.
    avg_age_rdd = group_by_key_rdd.map(avg_fun)

    # foreach is an action: it triggers the whole pipeline and prints every
    # resulting pair from the worker that processes it.
    avg_age_rdd.foreach(print)
finally:
    # Always release the Spark context, even if the job fails, so the
    # underlying resources are shut down cleanly.
    sc.stop()